library ("readxl") # version 1.4.3
library ("ggplot2") # version 3.4.4
library ("gridExtra") # version 2.3
library ("dplyr") # version 1.1.2
library ("paletteer") # version 1.4.0
library ("tibble") # version 3.2.1
library ("tidyr") # version 1.3.0
library ("ALFAM2") # version 3.7
library ("randomForest") # version 4.7.1.1
library ("xgboost") # version 1.7.6.1
library ("glmnet") # version 4.1.8
library ("treeshap") # version 0.3.1
library ("shapviz") # version 0.9.3
library ("kableExtra") # version 1.3.4Machine Learning Approaches for Ammonia Volatilization Prediction after Manure Application
Data description
load (file = "scripts/processed_data/data_alfam2.Rdata")data_alfam2 %>% summarise (n = n(), .by = dataset) dataset n
1 Calibration subset 5515
2 Evaluation subset 424
plot_data_description = data_alfam2 %>%
filter (! (app.mthd == "bsth" & incorp == "shallow")) %>%
mutate (strategy = paste (app.mthd, incorp, sep = " - ")) %>%
mutate (strategy = recode (strategy, "bc - none" = "A", "bc - shallow" = "B", "bc - deep" = "C", "bsth - none" = "D", "os - none" = "E", "ts - none" = "F")) %>%
ggplot () +
geom_line (aes (x = time, y = e.rel, group = pmid), linewidth = 0.15) +
ylab ("Cumulative emission (frac. applied TAN)") +
xlab ("Time (h)") +
theme (axis.title.y = element_text (margin = ggplot2::margin (r = 25)),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
# scale_color_manual (values = Dark2) +
facet_wrap (~ strategy)
plot_data_descriptiondata_alfam2 %>%
select (air.temp, wind.2m, rain.rate, rain.cum) %>%
summary air.temp wind.2m rain.rate rain.cum
Min. :-1.90 Min. : 0.05133 Min. :0.0000 Min. : 0.000
1st Qu.: 9.50 1st Qu.: 1.65665 1st Qu.:0.0000 1st Qu.: 0.000
Median :12.50 Median : 2.80000 Median :0.0000 Median : 0.000
Mean :13.28 Mean : 3.08744 Mean :0.0406 Mean : 1.131
3rd Qu.:16.85 3rd Qu.: 4.08000 3rd Qu.:0.0000 3rd Qu.: 0.400
Max. :35.24 Max. :16.78400 Max. :7.1000 Max. :55.900
data_alfam2 %>%
filter (time == max (time), .by = pmid) %>%
select (e.cum, time, tan.app, app.rate, man.dm, man.ph, t.incorp) %>%
summary e.cum time tan.app app.rate
Min. : 0.264 Min. :20.75 Min. : 10.90 Min. : 6.60
1st Qu.: 5.815 1st Qu.:48.16 1st Qu.: 36.66 1st Qu.: 18.73
Median : 10.968 Median :70.30 Median : 54.28 Median : 28.40
Mean : 16.162 Mean :62.13 Mean : 62.07 Mean : 29.76
3rd Qu.: 21.473 3rd Qu.:71.70 3rd Qu.: 80.30 3rd Qu.: 36.65
Max. :140.570 Max. :78.00 Max. :235.40 Max. :132.60
man.dm man.ph t.incorp
Min. : 1.000 Min. :6.40 Min. : 0.0000
1st Qu.: 3.790 1st Qu.:7.10 1st Qu.: 0.0000
Median : 6.755 Median :7.40 Median : 0.0833
Mean : 6.248 Mean :7.42 Mean : 1.1878
3rd Qu.: 8.150 3rd Qu.:7.70 3rd Qu.: 0.0833
Max. :13.600 Max. :8.50 Max. :48.0000
NA's :85 NA's :475
Part 1 - Model comparison
ALFAM2
Predictions of emissions using the ALFAM2 model.
# We use alfam2pars01 parameter without pH, like in Hafner et al, 2019
pars <- alfam2pars01[!grepl('man.ph', names(alfam2pars01))]
alfam2_predictions = alfam2 (
pars = pars,
dat = data_alfam2 %>% select (- j.NH3, - e.cum, - e.rel, - dataset, - country),
app.name = "tan.app",
time.name = "time",
time.incorp = "t.incorp",
group = "pmid",
prep = TRUE,
warn = FALSE
)We add the true values to the prediction dataframe and we keep only the last observation of each pmid.
alfam2_predictions = alfam2_predictions %>%
select (pmid, time, e, er) %>%
mutate (truth_e = data_alfam2$e.cum) %>%
mutate (truth_er = data_alfam2$e.rel) %>%
mutate (dataset = data_alfam2$dataset,
country = data_alfam2$country,
app.mthd = data_alfam2$app.mthd) %>%
select (pmid, time, e, er, truth_e, truth_er, dataset, country, app.mthd) %>%
filter (time == max (time), .by = pmid)Random forest
Predictions of emissions using random forest.
load (file = "scripts/processed_data/data_random_forest.Rdata")data_random_forest_calibration = data_random_forest %>%
filter (dataset == "Calibration subset") %>%
select (- pmid, - e.rel, - dataset, - country)
set.seed (123)
random_forest_model = randomForest (
e.cum ~ .,
data = data_random_forest_calibration,
importance = TRUE,
mtry = 19, nodesize = 3, replace = FALSE, sample_frac = 0.8
)plot (random_forest_model)random_forest_predictions = data_random_forest %>%
mutate (e.cum_hat = predict (
random_forest_model,
newdata = data_random_forest %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country)
)
) %>%
mutate (e.rel_hat = e.cum_hat / tan.app) %>%
mutate (app.mthd = recode (as.character (app.mthd), "1" = "bc", "2" = "ts", "3" = "os", "4" = "bsth")) %>%
select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd) Xgboost
Predictions of emissions using gradient boosting.
load (file = "scripts/processed_data/data_xgboost.Rdata")set.seed (123)
xgboost_model = xgboost (
data = xgb.DMatrix (
data = data_xgboost %>%
filter (dataset == "Calibration subset") %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix %>%
{.},
label = data_xgboost %>%
filter (dataset == "Calibration subset") %>%
select (e.cum) %>%
as.matrix %>%
{.}
),
max.depth = 6, nrounds = 300, eta = 0.3, min_child_weight = 0.5, subsample = 0.8,
verbose = FALSE,
objective = "reg:squarederror"
)xgboost_predictions = data_xgboost %>%
mutate (e.cum_hat = predict (
xgboost_model,
newdata = data_xgboost %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix
)
) %>%
mutate (e.rel_hat = e.cum_hat / tan.app) %>%
mutate (app.mthd = recode (as.character (app.mthd), "1" = "bc", "2" = "ts", "3" = "os", "4" = "bsth")) %>%
select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd)Lasso
Predictions of emissions using Lasso regression.
load (file = "scripts/processed_data/data_lasso.Rdata")set.seed (123)
lasso_model = cv.glmnet (
x = data_lasso %>%
filter (dataset == "Calibration subset") %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix %>%
{.},
y = data_lasso %>%
filter (dataset == "Calibration subset") %>%
select (e.cum) %>%
as.matrix %>%
{.},
alpha = 1
)lasso_predictions_vector = predict (
lasso_model,
data_lasso %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix
)
df_join = data_alfam2 %>% select (pmid, app.mthd) %>% distinct
lasso_predictions = data_lasso %>%
left_join (df_join, by = "pmid") %>%
mutate (e.cum_hat = exp (lasso_predictions_vector), e.cum = exp (e.cum)) %>%
mutate (e.rel_hat = e.cum_hat / tan.app) %>%
select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd) Comparison
predictions_of_all_methods = rbind (
alfam2_predictions %>% mutate (method = "alfam2"),
random_forest_predictions %>% mutate (method = "random forest"),
xgboost_predictions %>% mutate (method = "xgboost"),
lasso_predictions %>% mutate (method = "lasso")
)predictions_of_all_methods %>% head %>% mutate_if (is.numeric, round, digits = 1) %>% kable| pmid | time | e | er | truth_e | truth_er | dataset | country | app.mthd | method |
|---|---|---|---|---|---|---|---|---|---|
| 182 | 44.8 | 21.1 | 0.2 | 11.8 | 0.1 | Calibration subset | DK | bc | alfam2 |
| 183 | 45.2 | 7.3 | 0.1 | 5.5 | 0.1 | Calibration subset | DK | bsth | alfam2 |
| 184 | 45.1 | 14.7 | 0.2 | 11.2 | 0.1 | Calibration subset | DK | bc | alfam2 |
| 185 | 45.5 | 7.9 | 0.1 | 7.3 | 0.1 | Calibration subset | DK | bsth | alfam2 |
| 186 | 43.9 | 14.8 | 0.3 | 9.1 | 0.2 | Calibration subset | DK | bc | alfam2 |
| 187 | 45.2 | 8.5 | 0.2 | 6.5 | 0.1 | Calibration subset | DK | bsth | alfam2 |
Residuals
Comparison of residual for the calibration and evaluation subsets.
plot_residuals_complete_dataset = predictions_of_all_methods %>%
mutate (residuals = truth_er - er) %>%
mutate (app.mthd = recode (app.mthd, "bc" = "Broadcast", "bsth" = "Trailing hoses", "ts" = "Trailing shoes", "os" = "Open slot injection")) %>%
mutate (app.mthd = factor (app.mthd, levels = c ("Broadcast", "Trailing hoses", "Trailing shoes", "Open slot injection"))) %>%
ggplot () +
geom_boxplot (aes (x = country, y = residuals, fill = method)) +
geom_hline (yintercept = 0, linetype = 2) +
scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
facet_wrap (~ app.mthd, ncol = 1) +
ylab ("Residual (frac. applied TAN)") +
labs (fill = "") +
theme (legend.position = "bottom",
axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))
plot_residuals_complete_datasetComparison of residual for the evaluation subset.
plot_residuals_evaluation_subset = predictions_of_all_methods %>%
filter (dataset == "Evaluation subset") %>%
mutate (residuals = truth_er - er) %>%
mutate (app.mthd = recode (app.mthd, "bc" = "Broadcast", "bsth" = "Trailing hoses", "ts" = "Trailing shoes", "os" = "Open slot injection")) %>%
mutate (app.mthd = factor (app.mthd, levels = c ("Broadcast", "Trailing hoses", "Trailing shoes", "Open slot injection"))) %>%
ggplot () +
geom_boxplot (aes (x = country, y = residuals, fill = method), varwidth = TRUE) +
geom_hline (yintercept = 0, linetype = 2) +
scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
facet_wrap (~ app.mthd, ncol = 1) +
ylab ("Residual (frac. applied TAN)") +
labs (fill = "") +
theme (legend.position = "bottom",
axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))
plot_residuals_evaluation_subsetObserved vs predicted values
plot_observed_vs_predicted_values = predictions_of_all_methods %>%
filter (dataset == "Evaluation subset") %>%
mutate (method = recode (method, "alfam2" = "A", "lasso" = "B", "random forest" = "C", "xgboost" = "D")) %>%
ggplot () +
geom_point (aes (x = truth_e, y = e, color = method), size = 3) +
geom_abline (slope = 1, linetype = "dashed") +
facet_wrap (~ method) +
scale_color_manual (values = Dark2[c(2, 5, 6, 8)]) +
labs (color = "") +
theme (legend.position = "none",
axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
axis.title.x = element_text (margin = ggplot2::margin (t = 15, b = 20)),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
xlab ("Observed values (kg/ha)") + ylab ("Predicted values (kg/ha)") +
NULL
plot_observed_vs_predicted_valuesEvaluation metrics
df_evaluation_metrics = rbind (
predictions_of_all_methods %>%
select (prediction = e, truth = truth_e, dataset, method) %>%
mutate (response = "72h cum. emission"),
predictions_of_all_methods %>%
select (prediction = er, truth = truth_er, dataset, method) %>%
mutate (response = "72h relative cum. emission")
) %>%
summarise (
MSE = mean ((prediction - truth) ^ 2),
RMSE = sqrt (mean ((prediction - truth) ^ 2)),
Pearsons_r = cor (prediction, truth),
ME = 1 - (sum ( (prediction - truth) ^ 2) / sum ( (truth - mean (truth)) ^ 2)),
MAE = mean (abs (prediction - truth)),
MBE = mean (prediction - truth),
.by = c (dataset, method, response)
) %>%
mutate_if (is.numeric, round, digits = 2)
df_evaluation_metrics %>% arrange (response, dataset) %>% kable ()| dataset | method | response | MSE | RMSE | Pearsons_r | ME | MAE | MBE |
|---|---|---|---|---|---|---|---|---|
| Calibration subset | alfam2 | 72h cum. emission | 113.86 | 10.67 | 0.82 | 0.61 | 6.64 | -2.98 |
| Calibration subset | random forest | 72h cum. emission | 12.36 | 3.52 | 0.98 | 0.96 | 2.21 | 0.14 |
| Calibration subset | xgboost | 72h cum. emission | 0.54 | 0.73 | 1.00 | 1.00 | 0.26 | 0.00 |
| Calibration subset | lasso | 72h cum. emission | 122.02 | 11.05 | 0.78 | 0.58 | 6.70 | -2.56 |
| Evaluation subset | alfam2 | 72h cum. emission | 65.29 | 8.08 | 0.86 | 0.62 | 5.65 | -2.81 |
| Evaluation subset | random forest | 72h cum. emission | 20.34 | 4.51 | 0.94 | 0.88 | 3.28 | 0.92 |
| Evaluation subset | xgboost | 72h cum. emission | 38.34 | 6.19 | 0.90 | 0.78 | 4.09 | 0.51 |
| Evaluation subset | lasso | 72h cum. emission | 53.32 | 7.30 | 0.84 | 0.69 | 5.57 | -1.38 |
| Calibration subset | alfam2 | 72h relative cum. emission | 0.03 | 0.18 | 0.80 | 0.53 | 0.12 | -0.06 |
| Calibration subset | random forest | 72h relative cum. emission | 0.00 | 0.05 | 0.98 | 0.96 | 0.04 | 0.01 |
| Calibration subset | xgboost | 72h relative cum. emission | 0.00 | 0.01 | 1.00 | 1.00 | 0.00 | 0.00 |
| Calibration subset | lasso | 72h relative cum. emission | 0.03 | 0.16 | 0.79 | 0.59 | 0.11 | -0.04 |
| Evaluation subset | alfam2 | 72h relative cum. emission | 0.02 | 0.15 | 0.79 | 0.58 | 0.11 | -0.04 |
| Evaluation subset | random forest | 72h relative cum. emission | 0.01 | 0.10 | 0.92 | 0.81 | 0.08 | 0.03 |
| Evaluation subset | xgboost | 72h relative cum. emission | 0.01 | 0.12 | 0.89 | 0.75 | 0.08 | 0.01 |
| Evaluation subset | lasso | 72h relative cum. emission | 0.02 | 0.16 | 0.77 | 0.55 | 0.12 | 0.00 |
df_evaluation_metrics %>%
filter (response == "72h cum. emission") %>%
select (- MSE, - RMSE, - Pearsons_r, - ME) %>%
pivot_longer (cols = c (MAE, MBE)) %>%
arrange (name, dataset) %>%
mutate (variation = (1 - abs(value) / abs(value [method == "alfam2"])) * 100, .by = c(name, dataset)) %>%
mutate (variation = round (variation, digits = 1)) %>%
mutate (value = round (value, digits = 1)) %>%
kable ()| dataset | method | response | name | value | variation |
|---|---|---|---|---|---|
| Calibration subset | alfam2 | 72h cum. emission | MAE | 6.6 | 0.0 |
| Calibration subset | random forest | 72h cum. emission | MAE | 2.2 | 66.7 |
| Calibration subset | xgboost | 72h cum. emission | MAE | 0.3 | 96.1 |
| Calibration subset | lasso | 72h cum. emission | MAE | 6.7 | -0.9 |
| Evaluation subset | alfam2 | 72h cum. emission | MAE | 5.7 | 0.0 |
| Evaluation subset | random forest | 72h cum. emission | MAE | 3.3 | 41.9 |
| Evaluation subset | xgboost | 72h cum. emission | MAE | 4.1 | 27.6 |
| Evaluation subset | lasso | 72h cum. emission | MAE | 5.6 | 1.4 |
| Calibration subset | alfam2 | 72h cum. emission | MBE | -3.0 | 0.0 |
| Calibration subset | random forest | 72h cum. emission | MBE | 0.1 | 95.3 |
| Calibration subset | xgboost | 72h cum. emission | MBE | 0.0 | 100.0 |
| Calibration subset | lasso | 72h cum. emission | MBE | -2.6 | 14.1 |
| Evaluation subset | alfam2 | 72h cum. emission | MBE | -2.8 | 0.0 |
| Evaluation subset | random forest | 72h cum. emission | MBE | 0.9 | 67.3 |
| Evaluation subset | xgboost | 72h cum. emission | MBE | 0.5 | 81.9 |
| Evaluation subset | lasso | 72h cum. emission | MBE | -1.4 | 50.9 |
# Evaluation on evalutation subset
df_plot = df_evaluation_metrics %>%
filter (response == "72h cum. emission") %>%
select (- MSE, - RMSE) %>%
filter (dataset == "Evaluation subset") %>%
pivot_longer (cols = c (Pearsons_r, ME, MAE, MBE)) %>%
mutate (name = factor (name, levels = c("Pearsons_r", "ME", "MAE", "MBE")))
suppressWarnings(ggplot (df_plot) +
geom_histogram (aes (x = name, y = value, fill = method), position = "dodge", stat = "identity") +
facet_wrap (~ name, scales = "free", nrow = 1) +
xlab ("") + ylab ("") + labs (fill = "") +
theme (legend.position = "none ",
strip.text.y = element_text(angle = 0),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
ggtitle ("Evaluation subset") + theme (plot.title = element_text (margin= ggplot2::margin(0,0,20,0), hjust = 0, size = 30)))# Evaluation on calibration subset
df_plot = df_evaluation_metrics %>%
filter (response == "72h cum. emission") %>%
select (- MSE, - RMSE) %>%
filter (dataset == "Calibration subset") %>%
pivot_longer (cols = c (Pearsons_r, ME, MAE, MBE)) %>%
mutate (name = factor (name, levels = c("Pearsons_r", "ME", "MAE", "MBE")))
suppressWarnings(ggplot (df_plot) +
geom_histogram (aes (x = name, y = value, fill = method), position = "dodge", stat = "identity") +
facet_wrap (~ name, scales = "free", nrow = 1) +
xlab ("") + ylab ("") + labs (fill = "") +
theme (legend.position = "bottom",
strip.text.y = element_text(angle = 0),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
ggtitle ("Calibration subset") + theme (plot.title = element_text (margin= ggplot2::margin(0,0,20,0), hjust = 0, size = 30)))Part 2 - New training with the whole dataset
Random forest
set.seed (123)
random_forest_model = randomForest (
e.cum ~ .,
data = data_random_forest %>% select (- pmid, - e.rel, - dataset, - country),
importance = TRUE,
mtry = 19, nodesize = 3, replace = FALSE, sample_frac = 0.8
)Shapley
df_tmp = data_random_forest_calibration %>% select (- e.cum)
unified_model_rf <- treeshap::randomForest.unify (random_forest_model, df_tmp)
treeshap_rf <- treeshap::treeshap (unified_model_rf, df_tmp, verbose = FALSE)
shapley_rf <- shapviz (treeshap_rf, X = data_random_forest_calibration %>% select (- e.cum))plot_shap_rf = shapviz::sv_importance (shapley_rf, kind = "beeswarm", max_display = 10L) +
scale_colour_gradient (low = "darkblue", high = "red", breaks = c(0, 1), labels = c("low", "high")) +
theme (
legend.title = element_text (size = 25, margin = ggplot2::margin (r = 50, b = 100)),
legend.position = "bottom",
legend.key.width = unit(3, "cm"),
axis.title.x = element_text (margin = ggplot2::margin( b = 30))
)
plot_shap_rfGain
df_importance_rf = randomForest::importance (random_forest_model) %>%
as.data.frame %>%
rownames_to_column (var = "Variable") %>%
mutate (`%IncMSE` = `%IncMSE` / max (`%IncMSE`)) %>%
mutate (type = "A")
plot_importance_rf = ggplot (df_importance_rf) +
geom_point (aes (x = `%IncMSE`, y = Variable), size = 5) +
scale_y_discrete (limits = df_importance_rf %>% arrange (`%IncMSE`) %>% pull (Variable)) +
theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
xlab ("Importance") +
ylab ("") +
facet_wrap (~ type)
plot_importance_rfdf_importance_rf %>%
arrange (desc (`%IncMSE`)) %>%
mutate_if (is.numeric, round, digits = 2) %>%
kable ()| Variable | %IncMSE | IncNodePurity | type |
|---|---|---|---|
| app.mthd | 1.00 | 18855.07 | A |
| tan.app | 0.61 | 18809.39 | A |
| man.ph | 0.55 | 13928.94 | A |
| incorp | 0.43 | 6180.64 | A |
| t.incorp | 0.33 | 4605.37 | A |
| man.dm | 0.33 | 5602.95 | A |
| app.rate | 0.24 | 4444.92 | A |
| temp_6 | 0.23 | 1241.42 | A |
| rain_6 | 0.21 | 1181.81 | A |
| rain_1 | 0.20 | 371.33 | A |
| temp_5 | 0.19 | 1166.82 | A |
| time | 0.19 | 2469.28 | A |
| temp_1 | 0.18 | 1138.24 | A |
| wind_2 | 0.17 | 1647.65 | A |
| temp_2 | 0.17 | 1546.45 | A |
| wind_1 | 0.15 | 2374.54 | A |
| wind_3 | 0.14 | 1335.06 | A |
| temp_4 | 0.14 | 1038.28 | A |
| rain_2 | 0.14 | 254.33 | A |
| rain_5 | 0.13 | 118.99 | A |
| wind_6 | 0.13 | 1701.91 | A |
| temp_3 | 0.12 | 1617.32 | A |
| rain_4 | 0.11 | 132.87 | A |
| man.source | 0.08 | 688.20 | A |
| rain_3 | 0.08 | 140.28 | A |
| wind_5 | 0.08 | 597.15 | A |
| wind_4 | 0.07 | 602.99 | A |
Xgboost
set.seed (123)
xgboost_model = xgboost (
data = xgb.DMatrix (
data = data_xgboost %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix %>%
{.},
label = data_xgboost %>%
select (e.cum) %>%
as.matrix %>%
{.}
),
max.depth = 6, nrounds = 300, eta = 0.3, min_child_weight = 0.5, subsample = 0.8,
verbose = FALSE,
objective = "reg:squarederror"
)Shapley
df_tmp = data_xgboost %>% select (- pmid, - e.cum, - e.rel, - dataset, - country)
unified_model_xgboost <- treeshap::xgboost.unify (xgboost_model, as.matrix(df_tmp))
treeshap_xgboost <- treeshap::treeshap (unified_model_xgboost, df_tmp, verbose = FALSE)
shapley_xgb <- shapviz::shapviz (treeshap_xgboost, X = df_tmp)plot_shap_xgboost = shapviz::sv_importance (shapley_xgb, kind = "beeswarm", max_display = 10L) +
scale_colour_gradient (low = "darkblue", high = "red", breaks = c(0, 1), labels = c("low", "high")) +
theme (legend.title = element_text (size = 30, margin = ggplot2::margin (r = 50, b = 100)),
legend.position = "bottom",
legend.key.width = unit(3, "cm"),
axis.title.x = element_text (margin = ggplot2::margin( b = 30)))
plot_shap_xgboostGain
df_importance_xgb = xgboost::xgb.importance (model = xgboost_model) %>%
mutate (Gain = Gain / max (Gain)) %>%
mutate (type = "B")
plot_importance_xgboost = df_importance_xgb %>%
ggplot() +
geom_point (aes (x = Gain, y = Feature), size = 5) +
scale_y_discrete (limits = df_importance_xgb %>% arrange (Gain) %>% pull (Feature)) +
theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
xlab ("Importance") + ylab ("") +
facet_wrap (~ type)
plot_importance_xgboostdf_importance_xgb %>%
arrange (desc (Gain)) %>%
mutate_if (is.numeric, round, digits = 2) %>%
kable ()| Feature | Gain | Cover | Frequency | type |
|---|---|---|---|---|
| app.mthd | 1.00 | 0.01 | 0.02 | B |
| tan.app | 0.99 | 0.06 | 0.12 | B |
| man.ph | 0.75 | 0.12 | 0.06 | B |
| incorp | 0.44 | 0.00 | 0.02 | B |
| man.dm | 0.28 | 0.06 | 0.06 | B |
| app.rate | 0.16 | 0.03 | 0.05 | B |
| time | 0.14 | 0.18 | 0.24 | B |
| temp_1 | 0.13 | 0.05 | 0.04 | B |
| temp_2 | 0.07 | 0.05 | 0.04 | B |
| temp_4 | 0.06 | 0.01 | 0.01 | B |
| wind_2 | 0.06 | 0.04 | 0.03 | B |
| wind_3 | 0.06 | 0.01 | 0.02 | B |
| wind_6 | 0.05 | 0.04 | 0.04 | B |
| rain_6 | 0.05 | 0.01 | 0.03 | B |
| wind_1 | 0.05 | 0.09 | 0.05 | B |
| temp_6 | 0.04 | 0.02 | 0.03 | B |
| temp_3 | 0.03 | 0.02 | 0.02 | B |
| temp_5 | 0.02 | 0.01 | 0.01 | B |
| rain_4 | 0.02 | 0.06 | 0.02 | B |
| t.incorp | 0.01 | 0.00 | 0.00 | B |
| rain_1 | 0.01 | 0.06 | 0.03 | B |
| rain_2 | 0.01 | 0.01 | 0.02 | B |
| rain_5 | 0.01 | 0.01 | 0.00 | B |
| rain_3 | 0.01 | 0.01 | 0.01 | B |
| wind_4 | 0.01 | 0.01 | 0.01 | B |
| wind_5 | 0.00 | 0.01 | 0.01 | B |
| man.source | 0.00 | 0.00 | 0.00 | B |
Lasso
set.seed (123)
lasso_model = cv.glmnet (
x = data_lasso %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix %>%
{.},
y = data_lasso %>%
select (e.cum) %>%
as.matrix %>%
{.},
alpha = 1
)Part 3 - Testing scenarios
load (file = "scripts/processed_data/data_scenarios.Rdata")
data_scenarios = data_scenarios %>%
mutate (t.incorp = as.factor (t.incorp)) %>%
mutate (t.incorp = recode (t.incorp, "1000" = "-"))Random forest
load (file = "scripts/processed_data/data_scenarios_random_forest.Rdata")
data_tmp = data_scenarios_random_forest
random_forest_predictions_scenarios_vector = predict (
random_forest_model,
newdata = rbind (data_random_forest_calibration[1, -1], data_tmp)[- 1, ]
)
data_scenarios = data_scenarios %>%
mutate (e.cum_hat_random_forest = random_forest_predictions_scenarios_vector, .before = time) %>%
mutate (efficacy_random_forest = ((e.cum_hat_random_forest / e.cum_hat_random_forest[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100,
.by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain)) %>%
{.}Xgboost
load (file = "scripts/processed_data/data_scenarios_xgboost.Rdata")
xgboost_predictions_scenarios_vector = predict (
xgboost_model,
data_scenarios_xgboost %>% as.matrix
)
data_scenarios = data_scenarios %>%
mutate (e.cum_hat_xgboost = xgboost_predictions_scenarios_vector, .before = time) %>%
mutate (efficacy_xgboost = ((e.cum_hat_xgboost / e.cum_hat_xgboost[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100,
.by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))Lasso
load (file = "scripts/processed_data/data_scenarios_lasso.Rdata")
lasso_predictions_scenarios_vector = predict (
lasso_model,
data_scenarios_lasso %>% as.matrix
)
data_scenarios = data_scenarios %>%
mutate (e.cum_hat_lasso = lasso_predictions_scenarios_vector, .before = time) %>%
mutate (efficacy_lasso = ((e.cum_hat_lasso / e.cum_hat_lasso[app.mthd == "bc" & incorp == "none"]) - 1) * 100,
.by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))ALFAM2
load (file = "scripts/processed_data/data_scenarios_alfam2.Rdata")
alfam2_predictions_scenarios_vector = alfam2 (
pars = alfam2pars01,
dat = data_scenarios_alfam2,
app.name = "tan.app",
time.name = "time",
time.incorp = "t.incorp",
group = "pmid",
prep = TRUE,
warn = FALSE
) %>%
filter (time == max (time), .by = pmid) %>%
select (pmid, e.cum_hat_alfam2 = e)
data_scenarios = data_scenarios %>%
left_join (alfam2_predictions_scenarios_vector, by = "pmid") %>%
mutate (efficacy_alfam2 = ((e.cum_hat_alfam2 / e.cum_hat_alfam2[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100,
.by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))Comparison
plot_scenario_predictions = data_scenarios %>%
select (app.mthd, incorp, t.incorp, man.ph,
efficacy_random_forest, efficacy_xgboost, efficacy_lasso, efficacy_alfam2) %>%
rename (efficacy_rforest = efficacy_random_forest) %>%
pivot_longer (cols = c (5 : 8), names_to = c (".value", "method"), names_pattern = "(.+)_(.+)") %>%
mutate (method = recode (method, "rforest" = "random forest")) %>%
mutate (method = recode (method, "alfam2" = "A", "lasso" = "B", "random forest" = "C", "xgboost" = "D")) %>%
mutate (group = paste (app.mthd, incorp, t.incorp)) %>%
filter (! (app.mthd == "bc" & incorp == "none")) %>%
mutate (group = recode (group,
"bc shallow 0" = "incorporation",
"bsth none -" = "trailing hoses",
"os none -" = "open slot",
"ts none -" = "trailing shoes")) %>%
ggplot () +
geom_boxplot (aes (x = efficacy, y = group, fill = group)) +
scale_fill_manual (values = Dark2[c(2 : 5)]) +
scale_y_discrete (limits = c ("open slot", "trailing shoes", "trailing hoses", "incorporation")) +
ylab ("") + xlab ("Efficacy") +
theme (legend.position = "none",
axis.ticks.y = element_blank(),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
facet_wrap (~ method, ncol = 1)
plot_scenario_predictionsScenarios for which xgboost predicted a positive efficacy:
data_scenarios %>%
filter (efficacy_xgboost > 0) %>%
select (efficacy_xgboost, tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
kable ()| efficacy_xgboost | tan.app | app.mthd | app.rate | man.source | incorp | group_temp | group_wind | group_rain |
|---|---|---|---|---|---|---|---|---|
| 8.9906385 | 80.3025 | ts | 36.650 | pig | none | q1 | q1 | q1 |
| 9.0057694 | 80.3025 | ts | 36.650 | cat | none | q1 | q1 | q1 |
| 0.5613908 | 80.3025 | bsth | 18.725 | pig | none | q1 | q1 | q2 |
| 0.5621355 | 80.3025 | bsth | 18.725 | cat | none | q1 | q1 | q2 |
| 25.0445690 | 80.3025 | bsth | 36.650 | pig | none | q1 | q1 | q2 |
| 18.2393264 | 80.3025 | os | 36.650 | pig | none | q1 | q1 | q2 |
| 32.8066167 | 80.3025 | ts | 36.650 | pig | none | q1 | q1 | q2 |
| 25.0855518 | 80.3025 | bsth | 36.650 | cat | none | q1 | q1 | q2 |
| 18.2691620 | 80.3025 | os | 36.650 | cat | none | q1 | q1 | q2 |
| 32.8602812 | 80.3025 | ts | 36.650 | cat | none | q1 | q1 | q2 |
| 30.7067041 | 80.3025 | bsth | 36.650 | pig | none | q1 | q2 | q2 |
| 23.4712954 | 80.3025 | os | 36.650 | pig | none | q1 | q2 | q2 |
| 25.4853738 | 80.3025 | ts | 36.650 | pig | none | q1 | q2 | q2 |
| 30.7704933 | 80.3025 | bsth | 36.650 | cat | none | q1 | q2 | q2 |
| 23.5200540 | 80.3025 | os | 36.650 | cat | none | q1 | q2 | q2 |
| 25.5383165 | 80.3025 | ts | 36.650 | cat | none | q1 | q2 | q2 |
| 4.5797139 | 36.6595 | ts | 36.650 | pig | none | q2 | q1 | q1 |
| 4.5901176 | 36.6595 | ts | 36.650 | cat | none | q2 | q1 | q1 |
| 11.1164005 | 80.3025 | ts | 36.650 | pig | none | q2 | q1 | q1 |
| 11.4099749 | 80.3025 | ts | 36.650 | cat | none | q2 | q1 | q1 |
| 9.3674813 | 36.6595 | ts | 36.650 | pig | none | q2 | q1 | q2 |
| 9.3912529 | 36.6595 | ts | 36.650 | cat | none | q2 | q1 | q2 |
| 10.4259527 | 80.3025 | bsth | 18.725 | pig | none | q2 | q1 | q2 |
| 7.6296711 | 80.3025 | os | 18.725 | pig | none | q2 | q1 | q2 |
| 14.5180492 | 80.3025 | ts | 18.725 | pig | none | q2 | q1 | q2 |
| 10.6716109 | 80.3025 | bsth | 18.725 | cat | none | q2 | q1 | q2 |
| 7.8093910 | 80.3025 | os | 18.725 | cat | none | q2 | q1 | q2 |
| 14.8600869 | 80.3025 | ts | 18.725 | cat | none | q2 | q1 | q2 |
| 15.0486933 | 80.3025 | bsth | 36.650 | pig | none | q2 | q1 | q2 |
| 11.4423307 | 80.3025 | os | 36.650 | pig | none | q2 | q1 | q2 |
| 32.0733336 | 80.3025 | ts | 36.650 | pig | none | q2 | q1 | q2 |
| 15.4434423 | 80.3025 | bsth | 36.650 | cat | none | q2 | q1 | q2 |
| 11.7424345 | 80.3025 | os | 36.650 | cat | none | q2 | q1 | q2 |
| 32.9145756 | 80.3025 | ts | 36.650 | cat | none | q2 | q1 | q2 |
| 8.9087332 | 80.3025 | ts | 36.650 | pig | none | q2 | q2 | q1 |
| 9.0807629 | 80.3025 | ts | 36.650 | cat | none | q2 | q2 | q1 |
| 2.5264251 | 80.3025 | bsth | 18.725 | pig | none | q2 | q2 | q2 |
| 0.7573102 | 80.3025 | os | 18.725 | pig | none | q2 | q2 | q2 |
| 2.5725532 | 80.3025 | bsth | 18.725 | cat | none | q2 | q2 | q2 |
| 0.7710910 | 80.3025 | os | 18.725 | cat | none | q2 | q2 | q2 |
| 30.5988491 | 80.3025 | bsth | 36.650 | pig | none | q2 | q2 | q2 |
| 27.8218260 | 80.3025 | os | 36.650 | pig | none | q2 | q2 | q2 |
| 33.8994959 | 80.3025 | ts | 36.650 | pig | none | q2 | q2 | q2 |
| 31.3037845 | 80.3025 | bsth | 36.650 | cat | none | q2 | q2 | q2 |
| 28.4628769 | 80.3025 | os | 36.650 | cat | none | q2 | q2 | q2 |
| 34.6804699 | 80.3025 | ts | 36.650 | cat | none | q2 | q2 | q2 |
data_scenarios %>%
filter (efficacy_xgboost > 0) %>%
select (tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
lapply (table)$tan.app
36.6595 80.3025
4 42
$app.mthd
bsth os ts
14 12 20
$app.rate
18.725 36.65
12 34
$man.source
cat pig
23 23
$incorp
none
46
$group_temp
q1 q2
16 30
$group_wind
q1 q2
28 18
$group_rain
q1 q2
8 38
data_tmp = data_scenarios %>%
filter (efficacy_xgboost > 0) %>%
select (efficacy_xgboost, tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
mutate (group_rain = ifelse (group_rain == "q1", "low", "high")) %>%
mutate (tan.app = paste ("TAN = ", round (tan.app, digits = 1), " kg/ha", sep = ""),
app.rate = paste ("Application rate = ", round (app.rate, digits = 1), " t/ha or m3/ha", sep = ""),
group_rain = paste ("Rain = ", group_rain))
plot = ggplot (data_tmp) +
geom_histogram (aes (efficacy_xgboost)) +
facet_wrap (~ tan.app + app.rate + group_rain) +
xlab ("Efficacy") +
ylab ("Number of predictions") +
theme (axis.title.y = element_text (margin = ggplot2::margin (r = 20), size = 18),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)),
strip.text = element_text (size = 18),
axis.text = element_text (size = 18),
axis.title.x = element_text (size = 18))
plotdata_scenarios %>%
filter (! (app.mthd == "bc" & incorp == "none")) %>%
summarise (mean_rf = mean (efficacy_random_forest),
mean_alfam2 = mean (efficacy_alfam2),
mean_gbf = mean (efficacy_xgboost),
mean_lasso = mean (efficacy_lasso), .by = c ('app.mthd')) %>%
mutate_if (is.numeric, round, digits = 1) %>%
kable()| app.mthd | mean_rf | mean_alfam2 | mean_gbf | mean_lasso |
|---|---|---|---|---|
| bsth | -52.1 | -34.3 | -38.6 | -23.7 |
| os | -52.8 | -61.8 | -41.9 | -42.5 |
| ts | -50.9 | -34.3 | -35.0 | -23.7 |
| bc | -16.6 | -43.1 | -34.7 | -22.0 |
data_scenarios %>%
filter (! (app.mthd == "bc" & incorp == "none")) %>%
summarise (median_rf = median (efficacy_random_forest),
median_alfam2 = median (efficacy_alfam2),
median_gbf = median (efficacy_xgboost),
median_lasso = median (efficacy_lasso), .by = c ('app.mthd')) %>%
mutate_if (is.numeric, round, digits = 1) %>%
kable()| app.mthd | median_rf | median_alfam2 | median_gbf | median_lasso |
|---|---|---|---|---|
| bsth | -53.5 | -34.1 | -41.2 | -23.4 |
| os | -54.1 | -61.7 | -42.6 | -42.0 |
| ts | -52.6 | -34.1 | -35.4 | -23.4 |
| bc | -13.6 | -43.7 | -34.6 | -21.7 |
Figures
legend = cowplot::get_legend (plot_shap_rf)
plot_shap_rf_2 = plot_shap_rf +
annotate (geom = "label", label = "A", size = 22, fontface = "bold", x = 45, y = 1.5, fill = "#000080", color = "white") +
xlim (c (- 18, 45)) +
theme (legend.position = "none",
plot.margin = unit (c (0, 0, 1, 0), "cm"),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank())
plot_shap_xgboost_2 = plot_shap_xgboost +
annotate (geom = "label", label = "B", size = 22, fontface = "bold", x = 45, y = 1.5, fill = "#000080", color = "white") +
xlim (c (- 18, 45)) +
theme (legend.position = "none",
plot.margin = unit (c (0, 0, 1, 0), "cm"),
axis.title.x = element_text (margin = ggplot2::margin (t = 10, b = 20)))
png (file = "figures/plot_shap.png", width = 1000, height = 900)
do.call ("grid.arrange",
c (list(plot_shap_rf_2,
plot_shap_xgboost_2,
legend),
list(layout_matrix = as.matrix(c (1, 2, 3)),
heights = c (0.8, 1, 0.2))))
dev.off()png
2
png (file = "figures/plot_importance.png", width = 1300, height = 800)
grid.arrange (plot_importance_rf, plot_importance_xgboost, nrow = 1)
dev.off()png
2
Supplementary material
data = read_excel("data/Peng_Xu_et_al_2024/Supplementary_Table_3_use2775.xlsx", sheet = 2)head (data) %>% select (- Title) %>% kable()| No. | Author | Language | Year | Replicates | Fertilizer type | Nitrogen placement | Fertilizer application time | Soil tillage practices | Crop type | Water input | Tem | SOC | TN | pH | BD | Clay | CEC | Nrate | Erate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Lin et al. (2015) | Chinese | 2013 | 3 | U | SBC | 1 | CT | Rice | 1289 | 26.0 | 5.4 | 0.6 | 7.9 | 1.3 | 20.0 | 18.0 | 113 | 22.1 |
| 1 | Lin et al. (2015) | Chinese | 2013 | 3 | U | SBC | 1 | CT | Rice | 1289 | 26.0 | 5.4 | 0.6 | 7.9 | 1.3 | 20.0 | 18.0 | 150 | 32.7 |
| 1 | Lin et al. (2015) | Chinese | 2013 | 3 | U | SBC | 1 | CT | Rice | 1289 | 26.0 | 5.4 | 0.6 | 7.9 | 1.3 | 20.0 | 18.0 | 188 | 35.9 |
| 1 | Lin et al. (2015) | Chinese | 2013 | 3 | U | SBC | 1 | CT | Rice | 1289 | 26.0 | 5.4 | 0.6 | 7.9 | 1.3 | 20.0 | 18.0 | 225 | 44.6 |
| 1 | Lin et al. (2015) | Chinese | 2013 | 3 | U | SBC | 1 | CT | Rice | 1289 | 26.0 | 5.4 | 0.6 | 7.9 | 1.3 | 20.0 | 18.0 | 300 | 63.1 |
| 2 | Zhu et al. (2013) | Chinese | 2012 | 3 | Others | Mix | 2 | CT | Rice | 1500 | 25.1 | 21.9 | 1.9 | 5.8 | 1.1 | 38.3 | 11.4 | 113 | 38.3 |
plot_temperature_comparison = rbind (
data %>% select (temp = Tem) %>% mutate (data = "Data from Peng Xu et al (2024)"),
data_alfam2 %>% select (temp = air.temp) %>% mutate (data = "Data used for the alfam2 model")
) %>%
ggplot () +
geom_histogram (aes (x = temp)) +
facet_wrap (~ data, ncol = 1, scales = "free") +
theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))+
xlab ("Air temperature (°C)") +
ylab ("Number of observations")
plot_temperature_comparisonrbind (
data %>% select (temp = Tem) %>% mutate (data = "Data from Peng Xu et al (2024)"),
data_alfam2 %>% select (temp = air.temp) %>% mutate (data = "Data used for the alfam2 model")
) %>%
summarise (mean_temp = mean (temp), sd_temp = sd (temp), .by = data)# A tibble: 2 × 3
data mean_temp sd_temp
<chr> <dbl> <dbl>
1 Data from Peng Xu et al (2024) 20.7 6.86
2 Data used for the alfam2 model 13.3 5.48